library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.5     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.1.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(here)
## here() starts at C:/Users/Howard/Documents/GitHub/CodeClan/pda_dirty_data
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(stringr)
library(assertr)
library(testthat)
## 
## Attaching package: 'testthat'
## The following object is masked from 'package:dplyr':
## 
##     matches
## The following object is masked from 'package:purrr':
## 
##     is_null
## The following objects are masked from 'package:readr':
## 
##     edition_get, local_edition
## The following object is masked from 'package:tidyr':
## 
##     matches
library(readxl)

1 MVP 1.1 Task 1 - Decathlon Data This data is contained in the .rds file decathlon.rds. You’ll need to use read_rds() from readr to open it.

1 Call the R script that reads and processes the .rds file.

# Run the decathlon import script; here::here() resolves the path relative to
# the project root. NOTE(review): the script presumably reads decathlon.rds and
# defines `decathlon_data` -- confirm in R_scripts/read_rds_data.R.
source(here::here("R_scripts/read_rds_data.R"))
# Auto-print the data set as a quick visual check.
decathlon_data  

2 1.1.2 Analysis questions

## 1 Who had the longest long jump seen in the data?

# Keep every row whose long jump equals the overall maximum (ties included).
filter(decathlon_data, long_jump == max(long_jump))

## 2 What was the average 100m time in each competition?

# One row per competition. The mean gets an explicit, syntactic column name
# instead of the default `mean(x100m)`.
decathlon_data %>%
  group_by(competition) %>%
  summarise(mean_100m = mean(x100m))

2.1 3 Who had the highest total points across both competitions?

# Treats the question as: which row holds the single highest `points` value
# once both competitions are pooled (any ties are all returned).
filter(decathlon_data, points == max(points))

## 4 What were the shot-put scores for the top three competitors in each competition?

# Rank within each competition rather than filtering on hard-coded competition
# labels and binding two copies of the same pipeline together.
# `with_ties = FALSE` keeps exactly three rows per competition, matching the
# previous head(3) behaviour; rows come back sorted by descending shot_put
# within each competition.
decathlon_data %>%
  select(competition, competitor, shot_put) %>%
  group_by(competition) %>%
  slice_max(shot_put, n = 3, with_ties = FALSE) %>%
  ungroup()

2.2 5 What were the average points for competitors who ran the 400m in less than 50 seconds vs. those that ran the 400m in more than 50 seconds?

# Average points on each side of the 50-second 400m threshold, rounded to
# 2 d.p. and labelled. Each mean is extracted as a plain number with pull()
# before paste(), instead of pasting a whole tibble and relying on implicit
# list-to-character coercion. rbind() stacks the two labels into a 2 x 1
# character matrix.
rbind(
  decathlon_data %>%
    filter(x400m < 50) %>%
    summarise(mean_points = round(mean(points), 2)) %>%
    pull(mean_points) %>%
    paste(" average points where 400m < 50 secs"),
  # >= 50 (not > 50) so that a time of exactly 50 seconds is still counted.
  decathlon_data %>%
    filter(x400m >= 50) %>%
    summarise(mean_points = round(mean(points), 2)) %>%
    pull(mean_points) %>%
    paste(" average points where 400m >= 50 secs")
)
##      [,1]                                           
## [1,] "8120.48  average points where 400m < 50 secs" 
## [2,] "7727.17  average points where 400m >= 50 secs"

3 1.4 Task 4 - Halloween Candy Data

The data is in the files boing-boing-candy-2015.xlsx, boing-boing-candy-2016.xlsx and boing-boing-candy-2017.xlsx. Bear in mind that this task is trickier than tasks 1, 2 & 3.

# Run the candy import/cleaning script (path resolved from the project root by
# here::here()). It emits "NAs introduced by coercion" warnings when run.
# NOTE(review): presumably defines `candy_data_3`, which is used further down
# -- confirm in R_scripts/read_in_candy_data.R.
source(here::here("R_scripts/read_in_candy_data.R"))
## Warning: Unknown or uninitialised column: `which_country_do_you_live_in`.
## Warning in eval(ei, envir): NAs introduced by coercion
## Warning in eval(ei, envir): NAs introduced by coercion to integer range
## Warning in eval(ei, envir): NAs introduced by coercion

## Warning in eval(ei, envir): NAs introduced by coercion
## Warning in eval(ei, envir): NAs introduced by coercion to integer range

3.1 Output to show valid which_country_do_you_live_in data

The 2015 country values were populated from the 2016 residence data,
looping over the records to fill in the larger data set.
# Number of responses per country value, most frequent first.
# count(..., sort = TRUE) replaces group_by() + count() + arrange(desc(n)) and
# returns an ungrouped tibble, so no grouping is left behind on the result.
candy_data_3 %>%
  count(which_country_do_you_live_in, sort = TRUE)

4 Now the three years of data need to be merged.

# Run the script that, per its file name, drops columns and merges the yearly
# data sets. NOTE(review): presumably defines `candy_data` -- confirm in
# R_scripts/drop_cols_and_merge.R.
source(here::here("R_scripts/drop_cols_and_merge.R"))
# Auto-print `candy_data` for inspection.
candy_data